import pandas as pd
df1 = pd.read_csv('/home/asyabani/vehicles.csv')
df2 = pd.read_csv('/home/asyabani/labelencoder2.csv')
df2d = pd.read_csv('/home/asyabani/pddummies2.csv')
df3 = pd.read_csv('/home/asyabani/clean_dataset.csv')
df4 = pd.read_csv('/home/asyabani/clean_dataset2.csv')
dfm = pd.read_csv('/home/asyabani/cars_model_matches_msrp.csv')
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
warnings.simplefilter(action='ignore')
df1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 426880 entries, 0 to 426879 Data columns (total 26 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 426880 non-null int64 1 url 426880 non-null object 2 region 426880 non-null object 3 region_url 426880 non-null object 4 price 426880 non-null int64 5 year 425675 non-null float64 6 manufacturer 409234 non-null object 7 model 421603 non-null object 8 condition 252776 non-null object 9 cylinders 249202 non-null object 10 fuel 423867 non-null object 11 odometer 422480 non-null float64 12 title_status 418638 non-null object 13 transmission 424324 non-null object 14 VIN 265838 non-null object 15 drive 296313 non-null object 16 size 120519 non-null object 17 type 334022 non-null object 18 paint_color 296677 non-null object 19 image_url 426812 non-null object 20 description 426810 non-null object 21 county 0 non-null float64 22 state 426880 non-null object 23 lat 420331 non-null float64 24 long 420331 non-null float64 25 posting_date 426812 non-null object dtypes: float64(5), int64(2), object(19) memory usage: 84.7+ MB
df1.describe()
| id | price | year | odometer | county | lat | long | |
|---|---|---|---|---|---|---|---|
| count | 4.268800e+05 | 4.268800e+05 | 425675.000000 | 4.224800e+05 | 0.0 | 420331.000000 | 420331.000000 |
| mean | 7.311487e+09 | 7.519903e+04 | 2011.235191 | 9.804333e+04 | NaN | 38.493940 | -94.748599 |
| std | 4.473170e+06 | 1.218228e+07 | 9.452120 | 2.138815e+05 | NaN | 5.841533 | 18.365462 |
| min | 7.207408e+09 | 0.000000e+00 | 1900.000000 | 0.000000e+00 | NaN | -84.122245 | -159.827728 |
| 25% | 7.308143e+09 | 5.900000e+03 | 2008.000000 | 3.770400e+04 | NaN | 34.601900 | -111.939847 |
| 50% | 7.312621e+09 | 1.395000e+04 | 2013.000000 | 8.554800e+04 | NaN | 39.150100 | -88.432600 |
| 75% | 7.315254e+09 | 2.648575e+04 | 2017.000000 | 1.335425e+05 | NaN | 42.398900 | -80.832039 |
| max | 7.317101e+09 | 3.736929e+09 | 2022.000000 | 1.000000e+07 | NaN | 82.390818 | 173.885502 |
df2.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 177414 entries, 0 to 177413 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 price 177414 non-null int64 1 year 177414 non-null float64 2 condition 177414 non-null int64 3 cylinders 177414 non-null int64 4 fuel 177414 non-null int64 5 odometer 177414 non-null float64 6 title_status 177414 non-null int64 7 transmission 177414 non-null int64 8 drive 177414 non-null int64 9 type 177414 non-null int64 10 state 177414 non-null int64 11 MSRP 177414 non-null float64 12 quarter 177414 non-null int64 13 car_age 177414 non-null float64 14 is_vintage 177414 non-null int64 15 is_color_neutral 177414 non-null int64 dtypes: float64(4), int64(12) memory usage: 21.7 MB
df2.describe()
| price | year | condition | cylinders | fuel | odometer | title_status | transmission | drive | type | state | MSRP | quarter | car_age | is_vintage | is_color_neutral | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.774140e+05 | 177414.000000 | 177414.000000 | 177414.000000 | 177414.000000 | 1.774140e+05 | 177414.000000 | 177414.000000 | 177414.000000 | 177414.000000 | 177414.000000 | 177414.000000 | 177414.0 | 177414.000000 | 177414.000000 | 177414.000000 |
| mean | 6.584987e+04 | 2011.359228 | 0.963802 | 2.363246 | 2.029840 | 1.027614e+05 | 0.138670 | 0.096334 | 0.631495 | 5.565265 | 23.734491 | 33552.674473 | 2.0 | 9.640772 | 0.005067 | 0.491308 |
| std | 1.299463e+07 | 7.175836 | 0.971358 | 1.226768 | 0.451924 | 6.607392e+04 | 0.759142 | 0.355112 | 0.702337 | 4.301956 | 14.982771 | 14626.607756 | 0.0 | 7.175836 | 0.071004 | 0.499926 |
| min | 0.000000e+00 | 1900.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2000.000000 | 2.0 | -1.000000 | 0.000000 | 0.000000 |
| 25% | 4.500000e+03 | 2008.000000 | 0.000000 | 1.000000 | 2.000000 | 5.585350e+04 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 9.000000 | 24340.000000 | 2.0 | 5.000000 | 0.000000 | 0.000000 |
| 50% | 9.995000e+03 | 2013.000000 | 1.000000 | 3.000000 | 2.000000 | 9.919550e+04 | 0.000000 | 0.000000 | 1.000000 | 8.000000 | 23.000000 | 30225.000000 | 2.0 | 8.000000 | 0.000000 | 0.000000 |
| 75% | 1.899800e+04 | 2016.000000 | 2.000000 | 3.000000 | 2.000000 | 1.406590e+05 | 0.000000 | 0.000000 | 1.000000 | 9.000000 | 37.000000 | 41800.000000 | 2.0 | 13.000000 | 0.000000 | 1.000000 |
| max | 3.736929e+09 | 2022.000000 | 5.000000 | 6.000000 | 4.000000 | 1.365000e+06 | 5.000000 | 2.000000 | 2.000000 | 12.000000 | 50.000000 | 319995.000000 | 2.0 | 121.000000 | 1.000000 | 1.000000 |
df2d.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 177414 entries, 0 to 177413 Columns: 102 entries, price to state_wy dtypes: float64(4), int64(98) memory usage: 138.1 MB
df2d.describe()
| price | year | odometer | MSRP | quarter | car_age | is_vintage | is_color_neutral | condition_excellent | condition_fair | ... | state_sd | state_tn | state_tx | state_ut | state_va | state_vt | state_wa | state_wi | state_wv | state_wy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.774140e+05 | 177414.000000 | 1.774140e+05 | 177414.000000 | 177414.0 | 177414.000000 | 177414.000000 | 177414.000000 | 177414.000000 | 177414.000000 | ... | 177414.000000 | 177414.000000 | 177414.000000 | 177414.000000 | 177414.000000 | 177414.000000 | 177414.000000 | 177414.000000 | 177414.000000 | 177414.000000 |
| mean | 6.584987e+04 | 2011.359228 | 1.027614e+05 | 33552.674473 | 2.0 | 9.640772 | 0.005067 | 0.491308 | 0.425389 | 0.242664 | ... | 0.002897 | 0.027089 | 0.055204 | 0.001809 | 0.029654 | 0.006877 | 0.027771 | 0.025336 | 0.001674 | 0.000969 |
| std | 1.299463e+07 | 7.175836 | 6.607392e+04 | 14626.607756 | 0.0 | 7.175836 | 0.071004 | 0.499926 | 0.494403 | 0.428695 | ... | 0.053748 | 0.162344 | 0.228379 | 0.042498 | 0.169631 | 0.082640 | 0.164317 | 0.157145 | 0.040881 | 0.031122 |
| min | 0.000000e+00 | 1900.000000 | 0.000000e+00 | 2000.000000 | 2.0 | -1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 4.500000e+03 | 2008.000000 | 5.585350e+04 | 24340.000000 | 2.0 | 5.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 9.995000e+03 | 2013.000000 | 9.919550e+04 | 30225.000000 | 2.0 | 8.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 75% | 1.899800e+04 | 2016.000000 | 1.406590e+05 | 41800.000000 | 2.0 | 13.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| max | 3.736929e+09 | 2022.000000 | 1.365000e+06 | 319995.000000 | 2.0 | 121.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
8 rows × 102 columns
df3.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 121001 entries, 0 to 121000 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 price 121001 non-null int64 1 condition 121001 non-null int64 2 cylinders 121001 non-null int64 3 fuel 121001 non-null int64 4 odometer 121001 non-null int64 5 title_status 121001 non-null int64 6 transmission 121001 non-null int64 7 drive 121001 non-null int64 8 type 121001 non-null int64 9 state 121001 non-null int64 10 MSRP 121001 non-null int64 11 quarter 121001 non-null int64 12 car_age 121001 non-null int64 13 is_vintage 121001 non-null int64 14 is_color_neutral 121001 non-null int64 15 year_sin 121001 non-null float64 16 year_cos 121001 non-null float64 dtypes: float64(2), int64(15) memory usage: 15.7 MB
df3.describe()
| price | condition | cylinders | fuel | odometer | title_status | transmission | drive | type | state | MSRP | quarter | car_age | is_vintage | is_color_neutral | year_sin | year_cos | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 121001.000000 | 121001.000000 | 121001.000000 | 121001.000000 | 121001.000000 | 121001.000000 | 121001.000000 | 121001.000000 | 121001.000000 | 121001.000000 | 121001.000000 | 121001.0 | 121001.000000 | 121001.000000 | 121001.000000 | 121001.000000 | 121001.000000 |
| mean | 13667.702250 | 0.992372 | 2.148759 | 2.044958 | 106234.000017 | 0.165321 | 0.094239 | 0.655928 | 5.718044 | 23.679556 | 31342.145619 | 2.0 | 9.677110 | 0.003992 | 0.502087 | -0.033162 | 0.999255 |
| std | 9489.376825 | 0.960053 | 1.183678 | 0.380880 | 54150.141673 | 0.827767 | 0.347463 | 0.666707 | 4.277790 | 14.875976 | 10027.367255 | 0.0 | 6.351184 | 0.063054 | 0.499998 | 0.019686 | 0.001303 |
| min | 1050.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2130.000000 | 2.0 | 0.000000 | 0.000000 | 0.000000 | -0.355610 | 0.934634 |
| 25% | 6500.000000 | 0.000000 | 1.000000 | 2.000000 | 66311.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 9.000000 | 24340.000000 | 2.0 | 5.000000 | 0.000000 | 0.000000 | -0.043490 | 0.999054 |
| 50% | 11300.000000 | 1.000000 | 3.000000 | 2.000000 | 104739.000000 | 0.000000 | 0.000000 | 1.000000 | 8.000000 | 23.000000 | 29735.000000 | 2.0 | 8.000000 | 0.000000 | 1.000000 | -0.027963 | 0.999609 |
| 75% | 18550.000000 | 2.000000 | 3.000000 | 2.000000 | 143348.000000 | 0.000000 | 0.000000 | 1.000000 | 9.000000 | 37.000000 | 37700.000000 | 2.0 | 13.000000 | 0.000000 | 1.000000 | -0.018643 | 0.999826 |
| max | 89999.000000 | 5.000000 | 6.000000 | 4.000000 | 267842.000000 | 5.000000 | 2.000000 | 2.000000 | 12.000000 | 50.000000 | 115400.000000 | 2.0 | 116.000000 | 1.000000 | 1.000000 | -0.003107 | 0.999995 |
df4.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 120987 entries, 0 to 120986 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 price 120987 non-null int64 1 year 120987 non-null int64 2 condition 120987 non-null int64 3 cylinders 120987 non-null int64 4 fuel 120987 non-null int64 5 odometer 120987 non-null int64 6 title_status 120987 non-null int64 7 transmission 120987 non-null int64 8 drive 120987 non-null int64 9 type 120987 non-null int64 10 state 120987 non-null int64 11 MSRP 120987 non-null int64 12 car_age 120987 non-null int64 13 is_vintage 120987 non-null int64 14 is_color_neutral 120987 non-null int64 dtypes: int64(15) memory usage: 13.8 MB
df4.describe()
| price | year | condition | cylinders | fuel | odometer | title_status | transmission | drive | type | state | MSRP | car_age | is_vintage | is_color_neutral | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 120987.000000 | 120987.000000 | 120987.000000 | 120987.000000 | 120987.000000 | 120987.000000 | 120987.000000 | 120987.000000 | 120987.000000 | 120987.000000 | 120987.000000 | 120987.000000 | 120987.000000 | 120987.000000 | 120987.000000 |
| mean | 13670.109541 | 2011.326804 | 0.992214 | 2.148198 | 2.044947 | 106201.070322 | 0.165307 | 0.094242 | 0.655823 | 5.716226 | 23.679875 | 31328.920570 | 9.673196 | 0.003992 | 0.502211 |
| std | 9474.141538 | 6.351591 | 0.960172 | 1.183581 | 0.380860 | 54157.883320 | 0.827739 | 0.347496 | 0.666552 | 4.278554 | 14.876898 | 9997.938409 | 6.351591 | 0.063058 | 0.499997 |
| min | 1050.000000 | 1905.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2130.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 6500.000000 | 2008.000000 | 0.000000 | 1.000000 | 2.000000 | 66246.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 9.000000 | 24340.000000 | 5.000000 | 0.000000 | 0.000000 |
| 50% | 11380.000000 | 2013.000000 | 1.000000 | 3.000000 | 2.000000 | 104680.000000 | 0.000000 | 0.000000 | 1.000000 | 8.000000 | 23.000000 | 29735.000000 | 8.000000 | 0.000000 | 1.000000 |
| 75% | 18588.000000 | 2016.000000 | 2.000000 | 3.000000 | 2.000000 | 143300.000000 | 0.000000 | 0.000000 | 1.000000 | 9.000000 | 37.000000 | 37590.000000 | 13.000000 | 0.000000 | 1.000000 |
| max | 89950.000000 | 2021.000000 | 5.000000 | 6.000000 | 4.000000 | 267842.000000 | 5.000000 | 2.000000 | 2.000000 | 12.000000 | 50.000000 | 102100.000000 | 116.000000 | 1.000000 | 1.000000 |
# Add log price for visualization purpose
df1['log price']=np.log(df1['price'])
sns.distplot(df1['price'],hist=True,kde=True,rug=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7fd5080ff390>
sns.distplot(df2['price'],hist=True,kde=True,rug=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7fd4fab244a8>
sns.distplot(df3['price'],hist=True,kde=True,rug=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7fd4fab38940>
sns.distplot(df4['price'],hist=True,kde=True,rug=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7fd4f1489c88>
plt.figure(figsize=(16,12))
sns.set()
sns.boxplot(data=df1, x='manufacturer', y=('log price'))
plt.xticks(rotation=90)
plt.ylim(0,14)
(0, 14)
We can see that there seems to be significant price range differences among car manufacturers. However, the range and variance within each manufacturer seems very wide. This is most probably due to many other influencing factors as each manufacturer has various models (resulting in large number of car types) and each model may have a very different specification. Hence we do not include the manufacturer and model as predictors to our model.
import numpy as np
plt.scatter(df1['year'],np.log(df1['price']),color='darkred')
plt.xlabel('year')
plt.ylabel('log price')
Text(0,0.5,'log price')
There doesn't seem a very clear relationship, but there seems to be an indication that the later the year, the more likely the car to have higher prices, and higher range of price.
# Using df2 that has MRSP column
plt.scatter(np.log(df2['MSRP']),np.log(df2['price']),color='darkred')
plt.xlabel('log MSRP')
plt.ylabel('log price')
Text(0,0.5,'log price')
plt.figure(figsize=(16,12))
sns.set()
sns.boxplot(data=df1, x='state', y=('log price'))
plt.xticks(rotation=90)
plt.ylim(0,14)
(0, 14)
plt.scatter(df1['state'],df1['log price'],color='darkblue')
plt.xlabel('state')
plt.ylabel('log price')
plt.xticks(rotation=90)
plt.show()
sns.distplot(df1['odometer'],hist=True,kde=True,rug=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7fd56ed51c88>
sns.distplot(df2['odometer'],hist=True,kde=True,rug=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7fd56ed4a6d8>
sns.distplot(df3['odometer'],hist=True,kde=True,rug=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7fd521ce8a90>
sns.distplot(df4['odometer'],hist=True,kde=True,rug=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7fd521ce8390>
plt.scatter(np.log(df1['odometer']),np.log(df1['price']),color='darkblue')
plt.xlabel('odometer')
plt.ylabel('log price')
Text(0,0.5,'log price')
import numpy as np
plt.scatter(np.log(df2['odometer']),np.log(df2['price']),color='darkblue')
plt.xlabel('odometer')
plt.ylabel('log price')
Text(0,0.5,'log price')
import numpy as np
plt.scatter(np.log(df3['odometer']),np.log(df3['price']),color='darkblue')
plt.xlabel('odometer')
plt.ylabel('log price')
Text(0,0.5,'log price')
import numpy as np
plt.scatter(np.log(df4['odometer']),np.log(df4['price']),color='darkblue')
plt.xlabel('odometer')
plt.ylabel('log price')
Text(0,0.5,'log price')
condition = pd.DataFrame(df1['condition'].value_counts())
condition
| condition | |
|---|---|
| good | 121456 |
| excellent | 101467 |
| like new | 21178 |
| fair | 6769 |
| new | 1305 |
| salvage | 601 |
condition = pd.DataFrame(df1['condition'].value_counts())
sizes = condition['condition'].values
percent = 100.*sizes/sizes.sum()
labels = labels = ['{0} - {1:1.2f} %'.format(i,j) for i,j in zip(condition.index, percent)]
plt.rcParams["figure.figsize"] = [7.50, 3.50]
plt.rcParams["figure.autolayout"] = True
# colors = ['red', 'blue', 'green', 'yellow']
plt.title('condition')
patches, texts = plt.pie(sizes, shadow=True, startangle=90)
plt.legend(patches, labels, loc="best")
plt.axis('equal')
plt.show()
cylinders = pd.DataFrame(df1['cylinders'].value_counts())
sizes = cylinders['cylinders'].values
percent = 100.*sizes/sizes.sum()
labels = labels = ['{0} - {1:1.2f} %'.format(i,j) for i,j in zip(cylinders.index, percent)]
plt.rcParams["figure.figsize"] = [7.50, 3.50]
plt.rcParams["figure.autolayout"] = True
# colors = ['red', 'blue', 'green', 'yellow']
plt.title('cylinders')
patches, texts = plt.pie(sizes, shadow=True, startangle=90)
plt.legend(patches, labels, loc="best")
plt.axis('equal')
plt.show()
fuel = pd.DataFrame(df1['fuel'].value_counts())
sizes = fuel['fuel'].values
percent = 100.*sizes/sizes.sum()
labels = labels = ['{0} - {1:1.2f} %'.format(i,j) for i,j in zip(fuel.index, percent)]
plt.rcParams["figure.figsize"] = [7.50, 3.50]
plt.rcParams["figure.autolayout"] = True
# colors = ['red', 'blue', 'green', 'yellow']
plt.title('fuel')
patches, texts = plt.pie(sizes, shadow=True, startangle=90)
plt.legend(patches, labels, loc="best")
plt.axis('equal')
plt.show()
title_status = pd.DataFrame(df1['title_status'].value_counts())
sizes = title_status['title_status'].values
percent = 100.*sizes/sizes.sum()
labels = labels = ['{0} - {1:1.2f} %'.format(i,j) for i,j in zip(title_status.index, percent)]
plt.rcParams["figure.figsize"] = [7.50, 3.50]
plt.rcParams["figure.autolayout"] = True
# colors = ['red', 'blue', 'green', 'yellow']
plt.title('title_status')
patches, texts = plt.pie(sizes, shadow=True, startangle=90)
plt.legend(patches, labels, loc="best")
plt.axis('equal')
plt.show()
transmission = pd.DataFrame(df1['transmission'].value_counts())
sizes = transmission['transmission'].values
percent = 100.*sizes/sizes.sum()
labels = labels = ['{0} - {1:1.2f} %'.format(i,j) for i,j in zip(transmission.index, percent)]
plt.rcParams["figure.figsize"] = [7.50, 3.50]
plt.rcParams["figure.autolayout"] = True
# colors = ['red', 'blue', 'green', 'yellow']
plt.title('transmission')
patches, texts = plt.pie(sizes, shadow=True, startangle=90)
plt.legend(patches, labels, loc="best")
plt.axis('equal')
plt.show()
drive = pd.DataFrame(df1['drive'].value_counts())
sizes = drive['drive'].values
percent = 100.*sizes/sizes.sum()
labels = labels = ['{0} - {1:1.2f} %'.format(i,j) for i,j in zip(drive.index, percent)]
plt.rcParams["figure.figsize"] = [7.50, 3.50]
plt.rcParams["figure.autolayout"] = True
# colors = ['red', 'blue', 'green', 'yellow']
plt.title('drive')
patches, texts = plt.pie(sizes, shadow=True, startangle=90)
plt.legend(patches, labels, loc="best")
plt.axis('equal')
plt.show()
size = pd.DataFrame(df1['size'].value_counts())
sizes = size['size'].values
percent = 100.*sizes/sizes.sum()
labels = labels = ['{0} - {1:1.2f} %'.format(i,j) for i,j in zip(size.index, percent)]
plt.rcParams["figure.figsize"] = [7.50, 3.50]
plt.rcParams["figure.autolayout"] = True
# colors = ['red', 'blue', 'green', 'yellow']
plt.title('size')
patches, texts = plt.pie(sizes, shadow=True, startangle=90)
plt.legend(patches, labels, loc="best")
plt.axis('equal')
plt.show()
types = pd.DataFrame(df1['type'].value_counts())
sizes = types['type'].values
percent = 100.*sizes/sizes.sum()
labels = labels = ['{0} - {1:1.2f} %'.format(i,j) for i,j in zip(types.index, percent)]
plt.rcParams["figure.figsize"] = [7.50, 3.50]
plt.rcParams["figure.autolayout"] = True
# colors = ['red', 'blue', 'green', 'yellow']
plt.title('type')
patches, texts = plt.pie(sizes, shadow=True, startangle=90)
plt.legend(patches, labels, loc="best")
plt.axis('equal')
plt.show()
df1['paint_color'].value_counts()
white 79285 black 62861 silver 42970 blue 31223 red 30473 grey 24416 green 7343 custom 6700 brown 6593 yellow 2142 orange 1984 purple 687 Name: paint_color, dtype: int64
paint_color = pd.DataFrame(df1['paint_color'].value_counts())
sizes = paint_color['paint_color'].values
percent = 100.*sizes/sizes.sum()
labels = labels = ['{0} - {1:1.2f} %'.format(i,j) for i,j in zip(paint_color.index, percent)]
plt.rcParams["figure.figsize"] = [7.50, 3.50]
plt.rcParams["figure.autolayout"] = True
colors = ['white','black','silver','blue','red','grey','green','darkred','brown','yellow','orange','purple']
plt.title('paint_color')
patches, texts = plt.pie(sizes, shadow=True, startangle=90, colors=colors)
plt.legend(patches, labels, loc="best")
plt.axis('equal')
plt.show()
import matplotlib.pyplot as plt
state = pd.DataFrame(df1['state'].value_counts())
sizes = state['state'].values
percent = 100.*sizes/sizes.sum()
labels = labels = ['{0} - {1:1.2f} %'.format(i,j) for i,j in zip(state.index, percent)]
plt.rcParams["figure.figsize"] = [7.50, 3.50]
plt.rcParams["figure.autolayout"] = True
# colors = ['red', 'blue', 'green', 'yellow']
plt.title('state')
patches, texts = plt.pie(sizes, shadow=True, startangle=90)
plt.legend(patches, labels, loc="best")
plt.axis('equal')
plt.show()
For any null values, we can try to impute them by matching the manufacturer or car model.
Alternatively, based on the distribution of categorical variables above, any remaining null values can be filled with the mode in each categorical variable.
We will try to cluster the cleaner dataset to understand the existing groupings of the data which can be seen as market segments of used car listings.
dft = df4.copy()
x = dft.iloc[:, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]].values
price = dft.iloc[:,[0]].values
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
model=TSNE(learning_rate=100, random_state=0)
Tsne_transformed=model.fit_transform(x)
Tsne_transformed.shape
(120987, 2)
Tsne_transformed[:,0].shape
(120987,)
Tsne_transformed[:,0]
array([ -6.474205 , 22.44806 , 4.394041 , ..., 17.16176 ,
-12.488963 , -0.76163894], dtype=float32)
Tsne_transformed[:,1].shape
(120987,)
price.shape
(120987, 1)
price
array([[22590],
[ 4500],
[14000],
...,
[ 7977],
[ 7477],
[ 5477]])
prices = price.reshape(120987,)
xs=Tsne_transformed[:,0]
ys=Tsne_transformed[:,1]
plt.scatter(x=xs,y=ys, c=prices)
<matplotlib.collections.PathCollection at 0x7fc8af0a0b00>
There doesn't seem to be a clear distinction between different groups/clusters.
dft = df4.copy()
x5 = dft.iloc[:, [5]].values
Tsne_transformed5 = model.fit_transform(x5)
price = dft.iloc[:,[0]].values
prices = price.reshape(120987,)
xs5=Tsne_transformed5[:,0]
ys5=Tsne_transformed5[:,1]
plt.scatter(x=xs5,y=ys5, c=prices)
<matplotlib.collections.PathCollection at 0x7f710a5546a0>
dft = df4.copy()
xd = dft.values
Tsne_transformedd = model.fit_transform(xd)
price = dft.iloc[:,[0]].values
prices = price.reshape(120987,)
xdt=Tsne_transformedd[:,0]
yd=Tsne_transformedd[:,1]
plt.scatter(x=xdt,y=yd, c=prices)
<matplotlib.collections.PathCollection at 0x7f7113d37128>
We can try other clustering method if we can find more meaningful/distinctive results.
dfb = df4.copy()
dfb2 = dfb.values
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler
stscaler = StandardScaler().fit(dfb2)
X = stscaler.transform(dfb2)
from sklearn.neighbors import NearestNeighbors
#Finding epsilon
neigh = NearestNeighbors(n_neighbors=2)
nbrs = neigh.fit(X)
distances, indices = nbrs.kneighbors(X)
#Plotting distances in order to find epsilon
distances = np.sort(distances, axis=0)
distances = distances[:,1]
plt.plot(distances)
[<matplotlib.lines.Line2D at 0x7fd4e6848fd0>]
db = DBSCAN(eps=2, metric='euclidean', min_samples=15)
db.fit(dfb2)
set(db.labels_)
{-1,
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
32,
33,
34,
35,
36,
37}
db = DBSCAN(eps=2, metric='euclidean', min_samples=20)
db.fit(dfb2)
set(db.labels_)
{-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}
db = DBSCAN(eps=1, metric='euclidean', min_samples=15)
db.fit(dfb2)
set(db.labels_)
{-1,
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
32}
db = DBSCAN(eps=1, metric='euclidean', min_samples=20)
db.fit(dfb2)
set(db.labels_)
{-1, 0, 1, 2, 3, 4, 5, 6, 7, 8}
db = DBSCAN(eps=1, metric='euclidean', min_samples=30)
db.fit(dfb2)
set(db.labels_)
{-1}
db = DBSCAN(eps=1, metric='euclidean', min_samples=25)
db.fit(dfb2)
set(db.labels_)
{-1, 0, 1, 2, 3, 4, 5}
We will use eps=1 and min_samples=25 for 6 clusters + 1 outlier.
db.labels_.shape
(120987,)
dfbc = pd.DataFrame(np.concatenate((dfb2, db.labels_.reshape((120987, 1))), axis=1))
dfbc.head()
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 22590 | 2010 | 2 | 4 | 2 | 71229 | 0 | 2 | 0 | 8 | 1 | 46110 | 11 | 0 | 0 | -1 |
| 1 | 4500 | 1992 | 0 | 3 | 2 | 192000 | 0 | 0 | 0 | 0 | 1 | 25695 | 29 | 0 | 0 | -1 |
| 2 | 14000 | 2012 | 0 | 3 | 2 | 95000 | 0 | 0 | 1 | 5 | 1 | 37775 | 9 | 0 | 1 | -1 |
| 3 | 32990 | 2019 | 2 | 4 | 4 | 6897 | 0 | 2 | 0 | 8 | 1 | 38400 | 2 | 0 | 1 | -1 |
| 4 | 2100 | 2006 | 1 | 1 | 2 | 97000 | 0 | 0 | 0 | 4 | 1 | 21495 | 15 | 0 | 0 | -1 |
dfb.columns
Index(['price', 'year', 'condition', 'cylinders', 'fuel', 'odometer',
'title_status', 'transmission', 'drive', 'type', 'state', 'MSRP',
'car_age', 'is_vintage', 'is_color_neutral', 'c'],
dtype='object')
labels = db.labels_
core_samples = np.zeros_like(labels, dtype = bool)
core_samples[db.core_sample_indices_] = True
unique_labels = np.unique(labels)
colors = plt.cm.Spectral(np.linspace(0,1, len(unique_labels)))
for (label, color) in zip(unique_labels, colors):
class_member_mask = (labels == label)
xy = dfb2[class_member_mask & core_samples]
plt.plot(xy[:,5],xy[:,0], 'o', markerfacecolor = 'green', markersize = 15)
xy2 = dfb2[class_member_mask & ~core_samples]
plt.plot(xy2[:,5],xy2[:,0], 'o', markerfacecolor = 'red', markersize = 25)
xy3 = dfb2[class_member_mask & ~core_samples]
plt.plot(xy3[:,5],xy3[:,0], 'o', markerfacecolor = 'blue', markersize = 30)
xy4 = dfb2[class_member_mask & ~core_samples]
plt.plot(xy4[:,5],xy4[:,0], 'o', markerfacecolor = 'black', markersize = 15)
xy5 = dfb2[class_member_mask & ~core_samples]
plt.plot(xy5[:,5],xy5[:,0], 'o', markerfacecolor = 'yellow', markersize = 20)
xy6 = dfb2[class_member_mask & ~core_samples]
plt.plot(xy6[:,5],xy6[:,0], 'o', markerfacecolor = 'orange', markersize = 15)
xy7 = dfb2[class_member_mask & ~core_samples]
plt.plot(xy7[:,5],xy7[:,0], 'o', markerfacecolor = 'grey', markersize = 5)
plt.title("DBSCAN")
plt.show()
The cluster separation is not clear. We will try another clustering method with K-Means.
dfk=df4.copy()
dfk2=dfk.values
from sklearn.cluster import KMeans
inertia = []
for i in range(1, 11):
kmeans = KMeans(n_clusters = i, init = 'k-means++',
max_iter = 400, n_init = 10, random_state = 0)
kmeans.fit(dfk2)
inertia.append(kmeans.inertia_)
#Plotting the results onto a line graph to observe 'The elbow'
plt.plot(range(1, 11), inertia)
plt.title('Elbow Method')
plt.xlabel('k')
plt.ylabel('Inertia')
plt.show()
kmeans = KMeans(n_clusters = 3, init = 'k-means++', max_iter = 500,
n_init = 10, random_state = 0)
y_kmeans = kmeans.fit_predict(dfk2)
y_kmeans.shape
(120987,)
kpc = pd.DataFrame(np.concatenate((dfk2, y_kmeans.reshape((120987,1))), axis=1))
kpc.head()
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 22590 | 2010 | 2 | 4 | 2 | 71229 | 0 | 2 | 0 | 8 | 1 | 46110 | 11 | 0 | 0 | 2 |
| 1 | 4500 | 1992 | 0 | 3 | 2 | 192000 | 0 | 0 | 0 | 0 | 1 | 25695 | 29 | 0 | 0 | 1 |
| 2 | 14000 | 2012 | 0 | 3 | 2 | 95000 | 0 | 0 | 1 | 5 | 1 | 37775 | 9 | 0 | 1 | 0 |
| 3 | 32990 | 2019 | 2 | 4 | 4 | 6897 | 0 | 2 | 0 | 8 | 1 | 38400 | 2 | 0 | 1 | 2 |
| 4 | 2100 | 2006 | 1 | 1 | 2 | 97000 | 0 | 0 | 0 | 4 | 1 | 21495 | 15 | 0 | 0 | 0 |
#Visualising the clusters: ODOMETER
plt.scatter(dfk2[:, 5],dfk2[:, 0], c=y_kmeans, s=50, cmap='viridis')
<matplotlib.collections.PathCollection at 0x7fd4e0bbfac8>
Cluster separation is quite distinctive. We will apply this cluster to the remaining variables.
#Visualising the clusters: YEAR
plt.scatter(dfk2[:, 1],dfk2[:, 0], c=y_kmeans, s=50, cmap='viridis')
<matplotlib.collections.PathCollection at 0x7fd4e0a1e160>
#Visualising the clusters: CONDITION
plt.scatter(dfk2[:, 2],dfk2[:, 0], c=y_kmeans, s=50, cmap='viridis')
<matplotlib.collections.PathCollection at 0x7fd4e03c7898>
#Visualising the clusters: CYLINDERS
plt.scatter(dfk2[:, 3],dfk2[:, 0], c=y_kmeans, s=50, cmap='viridis')
<matplotlib.collections.PathCollection at 0x7fd4e031a390>
dfk.columns
Index(['price', 'year', 'condition', 'cylinders', 'fuel', 'odometer',
'title_status', 'transmission', 'drive', 'type', 'state', 'MSRP',
'car_age', 'is_vintage', 'is_color_neutral'],
dtype='object')
#Visualising the clusters: FUEL
plt.scatter(dfk2[:, 4],dfk2[:, 0], c=y_kmeans, s=50, cmap='viridis')
<matplotlib.collections.PathCollection at 0x7fd4e02f1390>
#Visualising the clusters: TITLE_STATUS
plt.scatter(dfk2[:, 6],dfk2[:, 0], c=y_kmeans, s=50, cmap='viridis')
<matplotlib.collections.PathCollection at 0x7fd4e02c90b8>
#Visualising the clusters: TRANSMISSION
plt.scatter(dfk2[:, 7],dfk2[:, 0], c=y_kmeans, s=50, cmap='viridis')
<matplotlib.collections.PathCollection at 0x7fd4e01d66a0>
#Visualising the clusters: DRIVE
plt.scatter(dfk2[:, 8],dfk2[:, 0], c=y_kmeans, s=50, cmap='viridis')
<matplotlib.collections.PathCollection at 0x7fd4dad70208>
#Visualising the clusters: TYPE
plt.scatter(dfk2[:, 9],dfk2[:, 0], c=y_kmeans, s=50, cmap='viridis')
<matplotlib.collections.PathCollection at 0x7fd4dace2c18>
#Visualising the clusters: STATE
plt.scatter(dfk2[:, 10],dfk2[:, 0], c=y_kmeans, s=50, cmap='viridis')
<matplotlib.collections.PathCollection at 0x7fd4dacb8e10>
#Visualising the clusters: MSRP
plt.scatter(dfk2[:, 11],dfk2[:, 0], c=y_kmeans, s=50, cmap='viridis')
<matplotlib.collections.PathCollection at 0x7fd4dac6f320>
#Visualising the clusters: CAR AGE
plt.scatter(dfk2[:, 12],dfk2[:, 0], c=y_kmeans, s=50, cmap='viridis')
<matplotlib.collections.PathCollection at 0x7fd4e02a6f60>
#Visualising the clusters: VINTAGE
plt.scatter(dfk2[:, 13],dfk2[:, 0], c=y_kmeans, s=50, cmap='viridis')
<matplotlib.collections.PathCollection at 0x7fd4e02424e0>
#Visualising the clusters: COLOR NEUTRAL
plt.scatter(dfk2[:, 14],dfk2[:, 0], c=y_kmeans, s=50, cmap='viridis')
<matplotlib.collections.PathCollection at 0x7fd4dab89fd0>
dfk.head()
| price | year | condition | cylinders | fuel | odometer | title_status | transmission | drive | type | state | MSRP | car_age | is_vintage | is_color_neutral | c | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 22590 | 2010 | 2 | 4 | 2 | 71229 | 0 | 2 | 0 | 8 | 1 | 46110 | 11 | 0 | 0 | 2 |
| 1 | 4500 | 1992 | 0 | 3 | 2 | 192000 | 0 | 0 | 0 | 0 | 1 | 25695 | 29 | 0 | 0 | 1 |
| 2 | 14000 | 2012 | 0 | 3 | 2 | 95000 | 0 | 0 | 1 | 5 | 1 | 37775 | 9 | 0 | 1 | 0 |
| 3 | 32990 | 2019 | 2 | 4 | 4 | 6897 | 0 | 2 | 0 | 8 | 1 | 38400 | 2 | 0 | 1 | 2 |
| 4 | 2100 | 2006 | 1 | 1 | 2 | 97000 | 0 | 0 | 0 | 4 | 1 | 21495 | 15 | 0 | 0 | 0 |
In clustering below,
cluster0 = cluster1 in the slide/previous version. "The Mid Class"
cluster1 = cluster0 in the slide/previous version. "The Lower Class"
cluster2 = cluster2 in the slide/previous version. "The Higher Class"
cluster0 = dfk[dfk['c']==0]
cluster1 = dfk[dfk['c']==1]
cluster2 = dfk[dfk['c']==2]
cluster0.describe()
| price | year | condition | cylinders | fuel | odometer | title_status | transmission | drive | type | state | MSRP | car_age | is_vintage | is_color_neutral | c | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 52990.000000 | 52990.000000 | 52990.000000 | 52990.000000 | 52990.000000 | 52990.000000 | 52990.000000 | 52990.000000 | 52990.000000 | 52990.000000 | 52990.000000 | 52990.000000 | 52990.000000 | 52990.000000 | 52990.00000 | 52990.0 |
| mean | 12301.733497 | 2011.156728 | 1.090092 | 2.137347 | 2.050783 | 109635.347726 | 0.141480 | 0.084242 | 0.638422 | 5.769560 | 23.822136 | 30764.942517 | 9.843272 | 0.003001 | 0.50919 | 0.0 |
| std | 7327.905377 | 5.383192 | 0.906215 | 1.175694 | 0.386420 | 18584.315711 | 0.766181 | 0.322756 | 0.648144 | 4.279597 | 14.845214 | 9501.811415 | 5.383192 | 0.054696 | 0.49992 | 0.0 |
| min | 1090.000000 | 1955.000000 | 0.000000 | 0.000000 | 0.000000 | 74933.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2130.000000 | 0.000000 | 0.000000 | 0.00000 | 0.0 |
| 25% | 6995.000000 | 2009.000000 | 0.000000 | 1.000000 | 2.000000 | 94000.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 9.000000 | 23995.000000 | 7.000000 | 0.000000 | 0.00000 | 0.0 |
| 50% | 10500.000000 | 2012.000000 | 1.000000 | 2.000000 | 2.000000 | 109130.000000 | 0.000000 | 0.000000 | 1.000000 | 8.000000 | 23.000000 | 29485.000000 | 9.000000 | 0.000000 | 1.00000 | 0.0 |
| 75% | 15918.000000 | 2014.000000 | 2.000000 | 3.000000 | 2.000000 | 125163.000000 | 0.000000 | 0.000000 | 1.000000 | 9.000000 | 37.000000 | 37170.000000 | 12.000000 | 0.000000 | 1.00000 | 0.0 |
| max | 79500.000000 | 2021.000000 | 5.000000 | 5.000000 | 4.000000 | 145000.000000 | 5.000000 | 2.000000 | 2.000000 | 12.000000 | 50.000000 | 96200.000000 | 66.000000 | 1.000000 | 1.00000 | 0.0 |
cluster1.describe()
| price | year | condition | cylinders | fuel | odometer | title_status | transmission | drive | type | state | MSRP | car_age | is_vintage | is_color_neutral | c | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 30259.00000 | 30259.000000 | 30259.000000 | 30259.00000 | 30259.000000 | 30259.000000 | 30259.000000 | 30259.000000 | 30259.000000 | 30259.000000 | 30259.000000 | 30259.000000 | 30259.000000 | 30259.000000 | 30259.000000 | 30259.0 |
| mean | 7376.17998 | 2007.492746 | 1.078522 | 2.32245 | 2.034932 | 177112.898510 | 0.120493 | 0.090519 | 0.581910 | 5.805612 | 23.795862 | 29881.027727 | 13.507254 | 0.000430 | 0.455567 | 1.0 |
| std | 5228.60893 | 4.739765 | 0.828140 | 1.17097 | 0.389145 | 27609.491568 | 0.721580 | 0.323014 | 0.640724 | 4.292616 | 14.538756 | 7822.175742 | 4.739765 | 0.020723 | 0.498030 | 0.0 |
| min | 1050.00000 | 1905.000000 | 0.000000 | 0.00000 | 0.000000 | 142682.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2200.000000 | 0.000000 | 0.000000 | 0.000000 | 1.0 |
| 25% | 3900.00000 | 2005.000000 | 0.000000 | 1.00000 | 2.000000 | 154769.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 10.000000 | 24340.000000 | 10.000000 | 0.000000 | 0.000000 | 1.0 |
| 50% | 5950.00000 | 2008.000000 | 1.000000 | 3.00000 | 2.000000 | 170000.000000 | 0.000000 | 0.000000 | 0.000000 | 8.000000 | 22.000000 | 29485.000000 | 13.000000 | 0.000000 | 0.000000 | 1.0 |
| 75% | 9000.00000 | 2011.000000 | 2.000000 | 3.00000 | 2.000000 | 193088.500000 | 0.000000 | 0.000000 | 1.000000 | 9.000000 | 37.000000 | 34380.000000 | 16.000000 | 0.000000 | 1.000000 | 1.0 |
| max | 52000.00000 | 2021.000000 | 5.000000 | 5.00000 | 4.000000 | 267842.000000 | 5.000000 | 2.000000 | 2.000000 | 12.000000 | 50.000000 | 65225.000000 | 116.000000 | 1.000000 | 1.000000 | 1.0 |
cluster2.describe()
| price | year | condition | cylinders | fuel | odometer | title_status | transmission | drive | type | state | MSRP | car_age | is_vintage | is_color_neutral | c | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 37738.000000 | 37738.000000 | 37738.000000 | 37738.000000 | 37738.000000 | 37738.000000 | 37738.000000 | 37738.000000 | 37738.000000 | 37738.000000 | 37738.000000 | 37738.000000 | 37738.000000 | 37738.000000 | 37738.000000 | 37738.0 |
| mean | 20638.106285 | 2014.639833 | 0.785574 | 2.023716 | 2.044782 | 44520.446844 | 0.234697 | 0.111267 | 0.739520 | 5.569665 | 23.387116 | 33281.779400 | 6.360167 | 0.008241 | 0.529811 | 2.0 |
| std | 10368.349215 | 6.916061 | 1.091136 | 1.187798 | 0.365827 | 21311.031968 | 0.973457 | 0.395934 | 0.702538 | 4.262150 | 15.182442 | 11776.894719 | 6.916061 | 0.090406 | 0.499117 | 0.0 |
| min | 1095.000000 | 1946.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2905.000000 | 0.000000 | 0.000000 | 0.000000 | 2.0 |
| 25% | 13725.000000 | 2014.000000 | 0.000000 | 1.000000 | 2.000000 | 28529.500000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 9.000000 | 25030.000000 | 3.000000 | 0.000000 | 0.000000 | 2.0 |
| 50% | 18995.000000 | 2016.000000 | 0.000000 | 1.000000 | 2.000000 | 46070.000000 | 0.000000 | 0.000000 | 1.000000 | 8.000000 | 23.000000 | 30775.000000 | 5.000000 | 0.000000 | 1.000000 | 2.0 |
| 75% | 25999.000000 | 2018.000000 | 2.000000 | 3.000000 | 2.000000 | 63061.000000 | 0.000000 | 0.000000 | 1.000000 | 9.000000 | 37.000000 | 41800.000000 | 7.000000 | 0.000000 | 1.000000 | 2.0 |
| max | 89950.000000 | 2021.000000 | 5.000000 | 6.000000 | 4.000000 | 84700.000000 | 5.000000 | 2.000000 | 2.000000 | 12.000000 | 50.000000 | 102100.000000 | 75.000000 | 1.000000 | 1.000000 | 2.0 |
print(cluster0['year'].mode())
print(cluster1['year'].mode())
print(cluster2['year'].mode())
0 2013 dtype: int64 0 2008 dtype: int64 0 2018 dtype: int64
print(cluster0['condition'].mode())
print(cluster1['condition'].mode())
print(cluster2['condition'].mode())
0 2 dtype: int64 0 1 dtype: int64 0 0 dtype: int64
print(cluster0['cylinders'].mode())
print(cluster1['cylinders'].mode())
print(cluster2['cylinders'].mode())
0 1 dtype: int64 0 3 dtype: int64 0 1 dtype: int64
print(cluster0['fuel'].mode())
print(cluster1['fuel'].mode())
print(cluster2['fuel'].mode())
0 2 dtype: int64 0 2 dtype: int64 0 2 dtype: int64
print(cluster0['title_status'].mode())
print(cluster1['title_status'].mode())
print(cluster2['title_status'].mode())
0 0 dtype: int64 0 0 dtype: int64 0 0 dtype: int64
print(cluster0['transmission'].mode())
print(cluster1['transmission'].mode())
print(cluster2['transmission'].mode())
0 0 dtype: int64 0 0 dtype: int64 0 0 dtype: int64
print(cluster0['drive'].mode())
print(cluster1['drive'].mode())
print(cluster2['drive'].mode())
0 0 dtype: int64 0 0 dtype: int64 0 1 dtype: int64
print(cluster0['type'].mode())
print(cluster1['type'].mode())
print(cluster2['type'].mode())
0 9 dtype: int64 0 0 dtype: int64 0 9 dtype: int64
print(cluster0['state'].mode())
print(cluster1['state'].mode())
print(cluster2['state'].mode())
0 4 dtype: int64 0 4 dtype: int64 0 4 dtype: int64
dfcar2 = pd.read_csv('/home/asyabani/dfcar2.csv')
dfcar2.head()
| Unnamed: 0 | price | year | condition | cylinders | fuel | odometer | title_status | transmission | drive | ... | match_score | manufacturer_msrp | model_msrp | year_msrp | cylinders_msrp | vstyle_msrp | MSRP | wd_msrp | trans_msrp | quarter | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 22590 | 2010.0 | good | 8.0 | gas | 71229.0 | clean | other | 4wd | ... | 1.0 | Chevrolet | Silverado 1500 | 2015.0 | 8.0 | Crew Cab Pickup | 46110.0 | 4WD | AUTOMATIC | Q2 |
| 1 | 7 | 35000 | 2019.0 | excellent | 6.0 | gas | 43000.0 | clean | automatic | 4wd | ... | 2.0 | Toyota | Tacoma | 2015.0 | 6.0 | Crew Cab Pickup | 27355.0 | 4WD | MANUAL | Q2 |
| 2 | 10 | 4500 | 1992.0 | excellent | 6.0 | gas | 192000.0 | clean | automatic | 4wd | ... | 1.0 | Jeep | Cherokee | 2017.0 | 4.0 | SUV | 25695.0 | 4WD | NaN | Q2 |
| 3 | 19 | 0 | 2011.0 | excellent | 4.0 | gas | 99615.0 | clean | automatic | 4wd | ... | 1.0 | Jeep | Compass | 2017.0 | 4.0 | SUV | 21940.0 | 4WD | NaN | Q2 |
| 4 | 32 | 14000 | 2012.0 | excellent | 6.0 | gas | 95000.0 | clean | automatic | fwd | ... | 1.0 | Honda | Odyssey | 2015.0 | 6.0 | Passenger Minivan | 37775.0 | FWD | AUTOMATIC | Q2 |
5 rows × 26 columns
dfdep_man = dfcar2.groupby(['manufacturer_msrp']).mean().reset_index()
dfdep_man['depreciation']=dfdep_man['price']-dfdep_man['MSRP']
dfdep_man['depr_percent']=dfdep_man['depreciation']/dfdep_man['MSRP']*100
# Top 10 higher depreciation by manufacturer.
dfdep_man[['manufacturer_msrp','price','MSRP','depreciation','depr_percent']].sort_values(by=['depr_percent']).head(10)
| manufacturer_msrp | price | MSRP | depreciation | depr_percent | |
|---|---|---|---|---|---|
| 4 | Cadillac | 14921.812517 | 55168.749664 | -40246.937147 | -72.952419 |
| 8 | FIAT | 7503.598174 | 26718.287671 | -19214.689498 | -71.915872 |
| 19 | Lincoln | 13101.868785 | 46561.060994 | -33459.192209 | -71.860889 |
| 0 | Acura | 11555.282073 | 40621.912065 | -29066.629992 | -71.554067 |
| 30 | Volvo | 11192.418291 | 38049.733133 | -26857.314843 | -70.584765 |
| 24 | Pontiac | 6886.864214 | 23412.381940 | -16525.517726 | -70.584521 |
| 21 | Mercedes-Benz | 16584.009404 | 56324.915361 | -39740.905956 | -70.556530 |
| 2 | BMW | 16398.980255 | 54319.284933 | -37920.304678 | -69.810022 |
| 14 | Infiniti | 13342.351323 | 43829.781526 | -30487.430202 | -69.558709 |
| 1 | Audi | 16262.317232 | 52912.487153 | -36650.169921 | -69.265634 |
dfdep_mod = dfcar2.groupby(['model_msrp','manufacturer_msrp']).mean().reset_index()
dfdep_mod.columns
Index(['model_msrp', 'manufacturer_msrp', 'Unnamed: 0', 'price', 'year',
'cylinders', 'odometer', 'match_score', 'year_msrp', 'cylinders_msrp',
'MSRP'],
dtype='object')
dfdep_mod['depreciation']=dfdep_mod['price']-dfdep_mod['MSRP']
dfdep_mod['depr_percent']=dfdep_mod['depreciation']/dfdep_mod['MSRP']*100
# Top 10 higher depreciation by model (-100% depreciation anomaly is removed)
dfdep_mod[['model_msrp','manufacturer_msrp','price','MSRP','depreciation',
'depr_percent']].sort_values(by=['depr_percent']).head(14).iloc[4:]
| model_msrp | manufacturer_msrp | price | MSRP | depreciation | depr_percent | |
|---|---|---|---|---|---|---|
| 614 | Sierra 1500 Hybrid | GMC | 1599.400000 | 45425.0 | -43825.600000 | -96.479031 |
| 352 | Intrepid | Dodge | 1454.166667 | 27055.0 | -25600.833333 | -94.625146 |
| 483 | Phaeton | Volkswagen | 3481.666667 | 64600.0 | -61118.333333 | -94.610423 |
| 733 | XC | Volvo | 2000.000000 | 36500.0 | -34500.000000 | -94.520548 |
| 112 | CL-Class | Mercedes-Benz | 12722.500000 | 211000.0 | -198277.500000 | -93.970379 |
| 723 | Windstar | Ford | 2273.240000 | 31115.0 | -28841.760000 | -92.694070 |
| 455 | Neon | Dodge | 1491.625000 | 19450.0 | -17958.375000 | -92.330977 |
| 499 | Q45 | Infiniti | 4911.250000 | 61600.0 | -56688.750000 | -92.027192 |
| 374 | LS 600h L | Lexus | 9629.266667 | 120060.0 | -110430.733333 | -91.979621 |
| 477 | Park Avenue | Buick | 3233.470588 | 39725.0 | -36491.529412 | -91.860364 |
print(df3['price'].mean())
print(df3['MSRP'].mean())
print(df3['price'].mean()-df3['MSRP'].mean())
print((df3['price'].mean()-df3['MSRP'].mean())/df3['MSRP'].mean()*100)
13667.702250394625 31342.14561863125 -17674.443368236625 -56.39193813754123
# Latest dataset
print(df4['price'].mean())
print(df4['MSRP'].mean())
print(df4['price'].mean()-df4['MSRP'].mean())
print((df4['price'].mean()-df4['MSRP'].mean())/df4['MSRP'].mean()*100)
13670.109540694455 31328.920569978593 -17658.811029284137 -56.36584570425946
Visualizing longitude and latitude data from the raw data.
We can see that the data is not clean since it's supposed to cover US area only.
These variables are not used as predictors.
import plotly.express as px
import pandas as pd
fig = px.scatter_geo(df1,lat='lat',lon='long', hover_name="id")
fig.update_layout(title = 'World map', title_x=0.5)
fig.show()